1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.lucene.analysis.pattern;
19  
20  import java.io.IOException;
21  import java.io.StringReader;
22  import java.util.ArrayList;
23  import java.util.List;
24  import java.util.regex.Pattern;
25  
26  import org.apache.lucene.analysis.Analyzer;
27  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
28  import org.apache.lucene.analysis.CharFilter;
29  import org.apache.lucene.analysis.TokenStream;
30  import org.apache.lucene.analysis.Tokenizer;
31  import org.apache.lucene.analysis.charfilter.MappingCharFilter;
32  import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
33  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34  
35  public class TestPatternTokenizer extends BaseTokenStreamTestCase 
36  {
37    public void testSplitting() throws Exception 
38    {
39      String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
40      String[][] tests = {
41        // group  pattern        input                    output
42        { "-1",   "--",          "aaa--bbb--ccc",         "aaa bbb ccc" },
43        { "-1",   ":",           "aaa:bbb:ccc",           "aaa bbb ccc" },
44        { "-1",   "\\p{Space}",  "aaa   bbb \t\tccc  ",   "aaa bbb ccc" },
45        { "-1",   ":",           "boo:and:foo",           "boo and foo" },
46        { "-1",   "o",           "boo:and:foo",           "b :and:f" },
47        { "0",    ":",           "boo:and:foo",           ": :" },
48        { "0",    qpattern,      "aaa 'bbb' 'ccc'",       "'bbb' 'ccc'" },
49        { "1",    qpattern,      "aaa 'bbb' 'ccc'",       "bbb ccc" }
50      };
51      
52      for( String[] test : tests ) {     
53        TokenStream stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile(test[1]), Integer.parseInt(test[0]));
54        ((Tokenizer)stream).setReader(new StringReader(test[2]));
55        String out = tsToString( stream );
56        // System.out.println( test[2] + " ==> " + out );
57  
58        assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
59        
60        // Make sure it is the same as if we called 'split'
61        // test disabled, as we remove empty tokens
62        /*if( "-1".equals( test[0] ) ) {
63          String[] split = test[2].split( test[1] );
64          stream = tokenizer.create( new StringReader( test[2] ) );
65          int i=0;
66          for( Token t = stream.next(); null != t; t = stream.next() ) 
67          {
68            assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
69          }
70        }*/
71      } 
72    }
73  
74    public void testOffsetCorrection() throws Exception {
75      final String INPUT = "Günther Günther is here";
76  
77      // create MappingCharFilter
78      List<String> mappingRules = new ArrayList<>();
79      mappingRules.add( "\"&uuml;\" => \"ü\"" );
80      NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
81      builder.add("&uuml;", "ü");
82      NormalizeCharMap normMap = builder.build();
83      CharFilter charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );
84  
85      // create PatternTokenizer
86      Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
87      stream.setReader(charStream);
88      assertTokenStreamContents(stream,
89          new String[] { "Günther", "Günther", "is", "here" },
90          new int[] { 0, 13, 26, 29 },
91          new int[] { 12, 25, 28, 33 },
92          INPUT.length());
93      
94      charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );
95      stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
96      stream.setReader(charStream);
97      assertTokenStreamContents(stream,
98          new String[] { "Günther", "Günther" },
99          new int[] { 0, 13 },
100         new int[] { 12, 25 },
101         INPUT.length());
102   }
103   
104   /** 
105    * TODO: rewrite tests not to use string comparison.
106    */
107   private static String tsToString(TokenStream in) throws IOException {
108     StringBuilder out = new StringBuilder();
109     CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
110     // extra safety to enforce, that the state is not preserved and also
111     // assign bogus values
112     in.clearAttributes();
113     termAtt.setEmpty().append("bogusTerm");
114     in.reset();
115     while (in.incrementToken()) {
116       if (out.length() > 0)
117         out.append(' ');
118       out.append(termAtt.toString());
119       in.clearAttributes();
120       termAtt.setEmpty().append("bogusTerm");
121     }
122 
123     in.close();
124     return out.toString();
125   }
126   
127   /** blast some random strings through the analyzer */
128   public void testRandomStrings() throws Exception {
129     Analyzer a = new Analyzer() {
130       @Override
131       protected TokenStreamComponents createComponents(String fieldName) {
132         Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
133         return new TokenStreamComponents(tokenizer);
134       }    
135     };
136     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
137     a.close();
138     
139     Analyzer b = new Analyzer() {
140       @Override
141       protected TokenStreamComponents createComponents(String fieldName) {
142         Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
143         return new TokenStreamComponents(tokenizer);
144       }    
145     };
146     checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
147     b.close();
148   }
149 
150   // LUCENE-6814
151   public void testHeapFreedAfterClose() throws Exception {
152     // TODO: can we move this to BaseTSTC to catch other "hangs onto heap"ers?
153 
154     // Build a 1MB string:
155     StringBuilder b = new StringBuilder();
156     for(int i=0;i<1024;i++) {
157       // 1023 spaces, then an x
158       for(int j=0;j<1023;j++) {
159         b.append(' ');
160       }
161       b.append('x');
162     }
163 
164     String big = b.toString();
165 
166     Pattern x = Pattern.compile("x");
167 
168     List<Tokenizer> tokenizers = new ArrayList<>();
169     for(int i=0;i<512;i++) {
170       Tokenizer stream = new PatternTokenizer(x, -1);
171       tokenizers.add(stream);
172       stream.setReader(new StringReader(big));
173       stream.reset();
174       for(int j=0;j<1024;j++) {
175         assertTrue(stream.incrementToken());
176       }
177       assertFalse(stream.incrementToken());
178       stream.end();
179       stream.close();
180     }
181   }
182 }